library(gKRLS)
library(tidyverse)
library(haven)
library(mgcv)
library(tidygam)
library(hardhat)
library(gridExtra)
library(patchwork)
library(scales)
library(modelsummary)

# read in data
ncvs_data <- read_dta("NCVS 2023 Data.dta")

ncvs_data <- ncvs_data %>%
  filter(V3085 %in% c(1:8), V3086 %in% c(1:8)) %>% # remove interviews without questions about gender identity
  filter(round(YEARQ, 1) %in% c(2017.1, 2017.2, 2017.3, 2017.4, # remove years when gender identity was only 
                      2018.1, 2018.2, 2018.3, 2018.4, # asked of crime victims
                      2022.3, 2022.4, 2023.1, 2023.2,
                      2023.3, 2023.4)) %>%
  filter(V3014 >= 18) %>% # remove respondents younger than 18
  mutate(afab = if_else(V3085 == 2, 1, 0),
         gender_id = factor(case_when(afab == 1 & V3086 == 2 ~ "Non-trans woman",
                                      afab == 0 & V3086 == 1 ~ "Non-trans man",
                                      afab == 1 & V3086 %in% c(1,3) ~ "Transmasculine",
                                      afab == 0 & V3086 %in% c(2:3) ~ "Transfeminine",
                                      V3086 == 4 ~ "Gender nonconforming")),
         trans = factor(case_when(gender_id %in% c("Non-trans woman", "Non-trans man") ~ 0,
                                  gender_id %in% c("Transmasculine", "Transfeminine", "Gender nonconforming") ~ 1))) %>%
  mutate(deaf = case_when(V3_V4526H3A == 1 ~ 1,
                          V3_V4526H3A == 2 ~ 0),
         blind = case_when(V3_V4526H3B == 1 ~ 1,
                           V3_V4526H3B == 2 ~ 0),
         decide = case_when(V3_V4526H5 == 1 ~ 1,
                            V3_V4526H5 == 2 ~ 0),
         walk = case_when(V3_V4526H4 == 1 ~ 1,
                          V3_V4526H4 == 2 ~ 0),
         dress = case_when(V3_V4526H6 == 1 ~ 1,
                           V3_V4526H6 == 2 ~ 0),
         alone = case_when(V3_V4526H7 == 1 ~ 1,
                           V3_V4526H7 == 2 ~ 0),
         phys_dis = if_else(deaf + blind + walk + dress == 0,
                            0, 1),
         phys_dis_fac = factor(phys_dis,
                               levels = c(0,1),
                               labels = c("No", "Yes")),
       yearq_rounded = round(YEARQ, 1))

# remove duplicate rows due to longitudinal design
dup_rows <- which(duplicated(ncvs_data$IDPER))

ncvs_person <- ncvs_data[-dup_rows,]

ncvs_person <- ncvs_person %>%
  filter(!is.na(phys_dis)) %>% # remove people with missing data for disability
  mutate(birth_year = YEAR - V3014) %>%
  mutate(year_fac = factor(YEAR))

# normalize sampling weights
ncvs_person$norm_weights <- (ncvs_person$WGTPERCY)/mean((ncvs_person$WGTPERCY))

# additional data cleaning
ncvs_person$phys_dis_fac <- factor(ncvs_person$phys_dis_fac)

ncvs_person <- ncvs_person %>%
  mutate(ethnicity = if_else(V3024A == 1, "Hispanic", "Non-Hispanic"),
         race = case_when(V3023A == 1 ~ "White",
                          V3023A == 2 ~ "Black",
                          V3023A == 4 ~ "Asian",
                          V3023A %in% c(3,5) ~ "Other race",
                          V3023A %in% c(6:20) ~ "Multiracial"),
         edu = case_when(V3020 %in% c(1:11, 27) ~ "LTHS",
                         V3020 == 28 ~ "HS",
                         V3020 %in% c(40,41) ~ "Some college",
                         V3020 %in% c(42:45) ~ "College grad"),
         birth_year_binned = case_when(birth_year <= 1945 ~ "1945",
                                       birth_year <= 1955 ~ "1955",
                                       birth_year <= 1965 ~ "1965",
                                       birth_year <= 1975 ~ "1975",
                                       birth_year <= 1985 ~ "1985",
                                       birth_year <= 1995 ~ "1995",
                                       birth_year <= 2005 ~ "2005"),
         gender_id = factor(case_when(afab == 1 & V3086 == 2 ~ "Non-trans woman",
                                      afab == 0 & V3086 == 1 ~ "Non-trans man",
                                      afab == 1 & V3086 %in% c(1,3) ~ "Transmasculine",
                                      afab == 0 & V3086 %in% c(2:3) ~ "Transfeminine",
                                      V3086 == 4 ~ "Gender nonconforming")))

## Table A3
wtpct <- function(x, y) sum(x, na.rm = TRUE) / sum(y, na.rm = T) * 100

datasummary((`Birth cohort` = birth_year_binned) + (`Race` = race) + (`Ethnicity` = ethnicity) + (`Education` = edu) + (`Physical Disability` = phys_dis_fac) + 1 ~ (`Gender Identity` = gender_id)*(N + norm_weights*Percent(fn = wtpct, denom = "col")),
            data = ncvs_person,
            output = "ncvs_summary.html")

## Figure 1 model

# fit model
dis_fit <- gam(trans ~ year_fac + s(birth_year, by = phys_dis_fac, bs = "gKRLS"),
               family = binomial(link = "logit"),
               data = ncvs_person,
               weights = norm_weights,
               method = "REML")

# get var-covar matrix
robust <- vcovCL(dis_fit, cluster = ncvs_person$V2117)

## list years to compute predictions for
year_5_dbl <- c(1940, 1940, 1945, 1945, 1950, 1950, 1955, 1955, 1960, 1960, 1965, 1965, 1970, 1970, 1975,
                1975, 1980, 1980, 1985, 1985, 1990, 1990, 1995, 1995, 2000, 2000, 2002, 2002, 2003, 2003,
                2004, 2004, 2005, 2005)

## get predictions based on model
dis_effects <- calculate_effects(dis_fit,
                                 variables = c("birth_year"),
                                 conditional = data.frame("birth_year" = year_5_dbl,
                                                          "phys_dis_fac" = c("Yes", "No"),
                                                          "year_fac" = "2023"),
                                 vcov = robust,
                                 continuous_type = "predict",
                                 verbose = T)

write_csv(dis_effects, "ncvs dis_effects.csv")


## Figure 4

# fit model (afab)
dis_fit_afab <- gam(trans ~ year_fac + s(birth_year, by = phys_dis_fac, bs = "gKRLS"),
               family = binomial(link = "logit"),
               data = ncvs_person[which(ncvs_person$afab == 1),],
               weights = norm_weights,
               method = "REML")

# get var-covar matrix
robust_afab <- vcovCL(dis_fit_afab, cluster = ncvs_person[which(ncvs_person$afab == 1),]$V2117)

# fit model (amab)
dis_fit_amab <- gam(trans ~ year_fac + s(birth_year, by = phys_dis_fac, bs = "gKRLS"),
                    family = binomial(link = "logit"),
                    data = ncvs_person[which(ncvs_person$afab == 0),],
                    weights = norm_weights,
                    method = "REML")

# get var-covar matrix
robust_amab <- vcovCL(dis_fit_amab, cluster = ncvs_person[which(ncvs_person$afab == 0),]$V2117)

# get predictions (afab)
dis_effects_afab <- calculate_effects(dis_fit_afab,
                                 variables = c("birth_year"),
                                 conditional = data.frame("birth_year" = year_5_dbl,
                                                          "phys_dis_fac" = c("Yes", "No"),
                                                          "year_fac" = "2023"),
                                 vcov = robust_afab,
                                 continuous_type = "predict",
                                 verbose = T)

write_csv(dis_effects_afab, "ncvs dis_effects_afab.csv")

dis_effects_amab <- calculate_effects(dis_fit_amab,
                                      variables = c("birth_year"),
                                      conditional = data.frame("birth_year" = 1945,
                                                               "phys_dis_fac" = c("Yes", "No"),
                                                               "year_fac" = "2023"),
                                      vcov = robust_amab,
                                      continuous_type = "predict",
                                      verbose = T)

write_csv(dis_effects_amab, "ncvs dis_effects_amab.csv")


